import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as py
import os
import tsfresh
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn import svm
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
from sklearn import preprocessing
from sklearn.model_selection import cross_validate
from tsfresh import extract_features
from tsfresh.feature_extraction import EfficientFCParameters
# channel index map: 23-ROC 24-LOC 27-ECG 33-Chest 34-ABD 35-Flow
def CreateDataset(base_path=None):
    """Return the paths of all .txt transcript files under *base_path*.

    Walks the directory tree bottom-up and keeps only files whose name
    ends in ``.txt``.  When called with no argument it falls back to the
    module-level ``path`` variable, which preserves the original call
    sites (``CreateDataset()`` after reassigning ``path``).
    """
    if base_path is None:
        base_path = path  # module-level global, set before each call
    files_list = []
    for root, dirs, names in os.walk(base_path, topdown=False):
        for name in names:
            files_list.append(os.path.join(root, name))
    # endswith avoids false matches where ".txt" merely appears inside a path
    return [f for f in files_list if f.endswith(".txt")]
#Indika_OSA
path = "H:\\UOM Final Research\\Workings\\Models\\NEW DATA\\5_secs_Indika\\OSA"
files = CreateDataset()
# 41 tab-separated fields per transcript: "date time", %CEB, C001..C038, TRIGGER
col_names = ["date", "%CEB"] + ["C%03d" % i for i in range(1, 39)] + ["TRIGGER"]
frames = []
for file in files:  # reads every transcript (the original hard-coded the first file, then appended files[1:])
    df = pd.read_csv(file, sep='\t', header=None)
    # flat column Index — the original's nested [[...]] accidentally built a MultiIndex
    df.columns = col_names
    frames.append(df)
# DataFrame.append was removed in pandas 2.0; concat is the supported equivalent
final = pd.concat(frames)
BOSA_I = final
# first column is "date time"; split it so only the Time part is kept below
BOSA_I[['Date', 'Time']] = BOSA_I.iloc[:, 0].str.split(" ", expand=True)
BOSA_I.drop(BOSA_I.columns[[0, 41]], axis=1, inplace=True)  # drop raw datetime and Date
# CSV round-trip re-infers numeric dtypes from the text data
BOSA_I.to_csv("BOSA_I.csv", index=False)
BOSA_I = pd.read_csv("BOSA_I.csv")
#Farzan_OSA
path = "H:\\UOM Final Research\\Workings\\Models\\NEW DATA\\5_secs_Farzan\\OSA"
files = CreateDataset()
# 41 tab-separated fields per transcript: "date time", %CEB, C001..C038, TRIGGER
col_names = ["date", "%CEB"] + ["C%03d" % i for i in range(1, 39)] + ["TRIGGER"]
frames = []
for file in files:  # reads every transcript (the original hard-coded the first file, then appended files[1:])
    df = pd.read_csv(file, sep='\t', header=None)
    # flat column Index — the original's nested [[...]] accidentally built a MultiIndex
    df.columns = col_names
    frames.append(df)
# DataFrame.append was removed in pandas 2.0; concat is the supported equivalent
final = pd.concat(frames)
BOSA_F = final
# first column is "date time"; split it so only the Time part is kept below
BOSA_F[['Date', 'Time']] = BOSA_F.iloc[:, 0].str.split(" ", expand=True)
BOSA_F.drop(BOSA_F.columns[[0, 41]], axis=1, inplace=True)  # drop raw datetime and Date
# CSV round-trip re-infers numeric dtypes from the text data
BOSA_F.to_csv("BOSA_F.csv", index=False)
BOSA_F = pd.read_csv("BOSA_F.csv")
#Shifaya_OSA
path = "H:\\UOM Final Research\\Workings\\Models\\NEW DATA\\5_secs_Shifaya\\OSA"
files = CreateDataset()
# 41 tab-separated fields per transcript: "date time", %CEB, C001..C038, TRIGGER
col_names = ["date", "%CEB"] + ["C%03d" % i for i in range(1, 39)] + ["TRIGGER"]
frames = []
for file in files:  # reads every transcript (the original hard-coded the first file, then appended files[1:])
    df = pd.read_csv(file, sep='\t', header=None)
    # flat column Index — the original's nested [[...]] accidentally built a MultiIndex
    df.columns = col_names
    frames.append(df)
# DataFrame.append was removed in pandas 2.0; concat is the supported equivalent
final = pd.concat(frames)
BOSA_S = final
# first column is "date time"; split it so only the Time part is kept below
BOSA_S[['Date', 'Time']] = BOSA_S.iloc[:, 0].str.split(" ", expand=True)
BOSA_S.drop(BOSA_S.columns[[0, 41]], axis=1, inplace=True)  # drop raw datetime and Date
# CSV round-trip re-infers numeric dtypes from the text data
BOSA_S.to_csv("BOSA_S.csv", index=False)
BOSA_S = pd.read_csv("BOSA_S.csv")
#dropping unnecessary columns and renaming the six channels of interest
# channels kept: C023=ROC, C024=LOC, C027=ECG, C033=Chest, C034=ABD, C035=Flow
unused_cols = (['%CEB'] + ['C%03d' % i for i in range(1, 23)]
               + ['C025', 'C026', 'C028', 'C029', 'C030', 'C031', 'C032',
                  'C036', 'C037', 'C038', 'TRIGGER'])
channel_map = {'C023': 'ROC', 'C024': 'LOC', 'C027': 'ECG',
               'C033': 'Chest', 'C034': 'ABD', 'C035': 'Flow'}
for patient_df in (BOSA_I, BOSA_F, BOSA_S):
    patient_df.drop(unused_cols, axis=1, inplace=True)
    patient_df.rename(columns=channel_map, inplace=True)
# stack the pre-OSA windows of all three patients into one frame
BOSA = pd.concat([BOSA_I, BOSA_F, BOSA_S])
# quick visual sanity check of the six retained channels
signal_cols = ['ROC', 'LOC', 'ECG', 'Chest', 'ABD', 'Flow']
BOSA.set_index('Time')[signal_cols].plot(subplots=True)
# (notebook output)
# array([<AxesSubplot:xlabel='Time'>, <AxesSubplot:xlabel='Time'>,
#        <AxesSubplot:xlabel='Time'>, <AxesSubplot:xlabel='Time'>,
#        <AxesSubplot:xlabel='Time'>, <AxesSubplot:xlabel='Time'>],
#       dtype=object)
#Indika_NonOSA
path = "H:\\UOM Final Research\\Workings\\Models\\NEW DATA\\Indika\\NonOSA_Indika_30"
files = CreateDataset()
# 41 tab-separated fields per transcript: "date time", %CEB, C001..C038, TRIGGER
col_names = ["date", "%CEB"] + ["C%03d" % i for i in range(1, 39)] + ["TRIGGER"]
frames = []
for file in files:  # reads every transcript (the original hard-coded the first file, then appended files[1:])
    df = pd.read_csv(file, sep='\t', header=None)
    # flat column Index — the original's nested [[...]] accidentally built a MultiIndex
    df.columns = col_names
    frames.append(df)
# DataFrame.append was removed in pandas 2.0; concat is the supported equivalent
final = pd.concat(frames)
BNOSA_I = final
# first column is "date time"; split it so only the Time part is kept below
BNOSA_I[['Date', 'Time']] = BNOSA_I.iloc[:, 0].str.split(" ", expand=True)
BNOSA_I.drop(BNOSA_I.columns[[0, 41]], axis=1, inplace=True)  # drop raw datetime and Date
# CSV round-trip re-infers numeric dtypes from the text data
BNOSA_I.to_csv("BNOSA_I.csv", index=False)
BNOSA_I = pd.read_csv("BNOSA_I.csv")
#Farzan_NonOSA
path = "H:\\UOM Final Research\\Workings\\Models\\NEW DATA\\Farzan\\Non OSA"
files = CreateDataset()
# 41 tab-separated fields per transcript: "date time", %CEB, C001..C038, TRIGGER
col_names = ["date", "%CEB"] + ["C%03d" % i for i in range(1, 39)] + ["TRIGGER"]
frames = []
for file in files:  # reads every transcript (the original hard-coded the first file, then appended files[1:])
    df = pd.read_csv(file, sep='\t', header=None)
    # flat column Index — the original's nested [[...]] accidentally built a MultiIndex
    df.columns = col_names
    frames.append(df)
# DataFrame.append was removed in pandas 2.0; concat is the supported equivalent
final = pd.concat(frames)
BNOSA_F = final
# first column is "date time"; split it so only the Time part is kept below
BNOSA_F[['Date', 'Time']] = BNOSA_F.iloc[:, 0].str.split(" ", expand=True)
BNOSA_F.drop(BNOSA_F.columns[[0, 41]], axis=1, inplace=True)  # drop raw datetime and Date
# CSV round-trip re-infers numeric dtypes from the text data
BNOSA_F.to_csv("BNOSA_F.csv", index=False)
BNOSA_F = pd.read_csv("BNOSA_F.csv")
#Shifaya_NonOSA
path = "H:\\UOM Final Research\\Workings\\Models\\NEW DATA\\Shifaya\\Non OSA"
files = CreateDataset()
# 41 tab-separated fields per transcript: "date time", %CEB, C001..C038, TRIGGER
col_names = ["date", "%CEB"] + ["C%03d" % i for i in range(1, 39)] + ["TRIGGER"]
frames = []
for file in files:  # reads every transcript (the original hard-coded the first file, then appended files[1:])
    df = pd.read_csv(file, sep='\t', header=None)
    # flat column Index — the original's nested [[...]] accidentally built a MultiIndex
    df.columns = col_names
    frames.append(df)
# DataFrame.append was removed in pandas 2.0; concat is the supported equivalent
final = pd.concat(frames)
BNOSA_S = final
# first column is "date time"; split it so only the Time part is kept below
BNOSA_S[['Date', 'Time']] = BNOSA_S.iloc[:, 0].str.split(" ", expand=True)
BNOSA_S.drop(BNOSA_S.columns[[0, 41]], axis=1, inplace=True)  # drop raw datetime and Date
# CSV round-trip re-infers numeric dtypes from the text data
BNOSA_S.to_csv("BNOSA_S.csv", index=False)
BNOSA_S = pd.read_csv("BNOSA_S.csv")
#dropping unnecessary columns and renaming the six channels of interest
# channels kept: C023=ROC, C024=LOC, C027=ECG, C033=Chest, C034=ABD, C035=Flow
unused_cols = (['%CEB'] + ['C%03d' % i for i in range(1, 23)]
               + ['C025', 'C026', 'C028', 'C029', 'C030', 'C031', 'C032',
                  'C036', 'C037', 'C038', 'TRIGGER'])
channel_map = {'C023': 'ROC', 'C024': 'LOC', 'C027': 'ECG',
               'C033': 'Chest', 'C034': 'ABD', 'C035': 'Flow'}
for patient_df in (BNOSA_I, BNOSA_F, BNOSA_S):
    patient_df.drop(unused_cols, axis=1, inplace=True)
    patient_df.rename(columns=channel_map, inplace=True)
# stack the regular-breathing windows of all three patients into one frame
BNOSA = pd.concat([BNOSA_I, BNOSA_F, BNOSA_S])
# quick visual sanity check of the six retained channels
BNOSA.set_index('Time')[['ROC', 'LOC', 'ECG', 'Chest', 'ABD', 'Flow']].plot(subplots=True)
# (notebook output)
# array([<AxesSubplot:xlabel='Time'>, <AxesSubplot:xlabel='Time'>,
#        <AxesSubplot:xlabel='Time'>, <AxesSubplot:xlabel='Time'>,
#        <AxesSubplot:xlabel='Time'>, <AxesSubplot:xlabel='Time'>],
#       dtype=object)
BOSA['OSA']=1 #adding dependent variable for OSA data
BNOSA['OSA']=0 #adding dependent variable for NonOSA data
# stack the labelled pre-OSA and regular-breathing windows into one dataset
Data = pd.concat([BOSA, BNOSA]) #merging_#1
print(Data.shape)
Data
# (notebook output)
# (25600, 8)
# | ROC | LOC | ECG | Chest | ABD | Flow | Time | OSA | |
# |---|---|---|---|---|---|---|---|---|
# | 0 | 0.0449 | 0.1008 | -0.2198 | -0.2393 | -0.3371 | -0.1707 | 21:17:25 | 1 |
# | 1 | 0.0383 | 0.0944 | -0.3791 | -0.2342 | -0.3326 | -0.1752 | 21:17:25 | 1 |
# | 2 | 0.0088 | 0.0734 | -0.5173 | -0.2254 | -0.3262 | -0.1802 | 21:17:25 | 1 |
# | 3 | -0.0064 | 0.0630 | -0.6013 | -0.2209 | -0.3206 | -0.1739 | 21:17:25 | 1 |
# | 4 | 0.0120 | 0.0816 | -0.7568 | -0.2260 | -0.3182 | -0.1683 | 21:17:25 | 1 |
# | ... | ... | ... | ... | ... | ... | ... | ... | ... |
# | 2555 | -0.0452 | -0.0396 | -0.0128 | 0.0425 | 0.0526 | -0.1356 | 22:47:07 | 0 |
# | 2556 | -0.0532 | -0.0473 | -0.0284 | 0.0452 | 0.0521 | -0.1356 | 22:47:07 | 0 |
# | 2557 | -0.0561 | -0.0476 | -0.0354 | 0.0465 | 0.0558 | -0.1369 | 22:47:07 | 0 |
# | 2558 | -0.0580 | -0.0439 | -0.0306 | 0.0534 | 0.0617 | -0.1332 | 22:47:07 | 0 |
# | 2559 | -0.0532 | -0.0441 | -0.0266 | 0.0534 | 0.0678 | -0.1287 | 22:47:07 | 0 |
# 25600 rows × 8 columns
#add unique id for each one-second time window (256 samples per window;
# the original comment said "256 milliseconds" — presumably 256 Hz sampling, TODO confirm)
rows = 256
# generalized: derive the window count from the data instead of hard-coding 100
n_windows = len(Data) // rows
idlist = []
for n in range(1, n_windows + 1):
    idlist.extend([n] * rows)  # extend avoids the original's quadratic list concatenation
Data['id'] = idlist
# ROC trace of the first ~3 windows, annotated with window boundaries
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(Data['ROC'].iloc[0:800], color='cyan', lw=2)
ax.set_xlabel('Time (Milliseconds)', fontsize=18)
ax.set_ylabel('')
plt.xticks(rotation=90, fontsize=14.5)
plt.yticks(fontsize=14.5)
tick_positions = np.arange(0, 800, 16)
plt.margins(x=0)
plt.xticks(tick_positions)
ax.set_facecolor('darkslategrey')
ax.xaxis.labelpad = 20
plt.grid(True, lw=0.3)
# dashed red boxes mark two windows; the green box highlights the first window
win_box_a = plt.Rectangle((-1, -0.18), 257, 0.4, fill=False, color='red', lw=4, linestyle='dashed')
ax.add_patch(win_box_a)
win_box_b = plt.Rectangle((512, -0.18), 256, 0.4, fill=False, color='red', lw=4, linestyle='dashed')
ax.add_patch(win_box_b)
highlight = plt.Rectangle((0.5, -0.05), 256, 0.13, fill=False, color='lime', lw=4.5)
ax.add_patch(highlight)
# window-length caption and window labels
ax.text(85, -0.13, " Window Length\n(256 milliseconds)", color='yellow', fontsize=15, fontweight='bold')
ax.text(126, 0.17, "W1", color='yellow', fontsize=25, fontweight='bold', alpha=0.6)
ax.text(372, 0.17, "W2", color='yellow', fontsize=25, fontweight='bold', alpha=0.6)
ax.text(635, 0.17, "W3", color='yellow', fontsize=25, fontweight='bold', alpha=0.6)
# arrows pointing out the highlighted window's extent
left_arrow = plt.Arrow(84, -0.12, -84, 0, color='red', width=0.01)
ax.add_patch(left_arrow)
right_arrow = plt.Arrow(176, -0.12, 80, 0, color='red', width=0.01)
ax.add_patch(right_arrow)
plt.show()
# ROC trace of the first six pre-OSA windows with window boundaries marked
fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(Data['ROC'].iloc[0:1400], color='cyan', lw=2)
ax.set_title('Signal patterns of Pre-OSA Segments', fontsize=20, pad=20)
ax.set_xlabel('')
ax.set_ylabel('ROC', fontsize=18)
plt.xticks([])
plt.yticks(fontsize=14.5)
ax.set_facecolor('darkslategrey')
ax.xaxis.labelpad = 20
plt.grid(True, lw=0.3)
# dashed boxes on alternating windows (W1, W3, W5)
for x0 in (0, 512, 1024):
    ax.add_patch(plt.Rectangle((x0, -0.2), 256, 0.4, fill=False,
                               color='red', lw=4, linestyle='dashed'))
# window labels W1..W6
for idx, x0 in enumerate((93, 349, 605, 861, 1117, 1373), start=1):
    ax.text(x0, 0.15, "W%d" % idx, color='yellow', fontsize=20,
            fontweight='bold', alpha=0.50)
plt.show()
# ROC trace of the last six regular-breathing windows (W95..W100)
fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(Data['ROC'].iloc[24200:25600], color='cyan', lw=2)
ax.set_title('Signal patterns of Regular Breathing Segments', fontsize=20, pad=20)
ax.set_xlabel('')
ax.set_ylabel('ROC', fontsize=18)
plt.xticks([])
plt.yticks(fontsize=14.5)
ax.set_facecolor('darkslategrey')
ax.xaxis.labelpad = 20
plt.grid(True, lw=0.3)
# dashed boxes on alternating windows
for x0 in (1792, 2304, 1280):
    ax.add_patch(plt.Rectangle((x0, -0.2), 256, 0.4, fill=False,
                               color='red', lw=4, linestyle='dashed'))
# window labels W100 down to W95
for label, x0 in [("W100", 2400), ("W99", 2144), ("W98", 1888),
                  ("W97", 1632), ("W96", 1376), ("W95", 1120)]:
    ax.text(x0, 0.06, label, color='yellow', fontsize=20,
            fontweight='bold', alpha=0.50)
plt.show()
#drop unnecessary columns — tsfresh input needs only the six signals plus the window id
Data2 = Data.drop(['Time', 'OSA'], axis=1)
print(Data2.shape)
Data2.head()
# (notebook output)
# (25600, 7)
# | ROC | LOC | ECG | Chest | ABD | Flow | id | |
# |---|---|---|---|---|---|---|---|
# | 0 | 0.0449 | 0.1008 | -0.2198 | -0.2393 | -0.3371 | -0.1707 | 1 |
# | 1 | 0.0383 | 0.0944 | -0.3791 | -0.2342 | -0.3326 | -0.1752 | 1 |
# | 2 | 0.0088 | 0.0734 | -0.5173 | -0.2254 | -0.3262 | -0.1802 | 1 |
# | 3 | -0.0064 | 0.0630 | -0.6013 | -0.2209 | -0.3206 | -0.1739 | 1 |
# | 4 | 0.0120 | 0.0816 | -0.7568 | -0.2260 | -0.3182 | -0.1683 | 1 |
#making the target/dependent variable
target = Data[['Time', 'OSA']]
#row_reduction: keep one label per 256-sample window (each window's last row)
target = target.iloc[255::256]
target.index = range(1, 101)  # align with the tsfresh window ids 1..100
print(target.shape)
target.head()
# (notebook output)
# (100, 2)
# | Time | OSA | |
# |---|---|---|
# | 1 | 21:17:25 | 1 |
# | 2 | 21:17:54 | 1 |
# | 3 | 21:18:40 | 1 |
# | 4 | 21:19:14 | 1 |
# | 5 | 21:19:41 | 1 |
target1 = target.drop(columns=['Time'])  # labels only
#Checking data imbalances
imb = target.OSA.value_counts().reset_index()
# NOTE(review): in pandas >= 2.0 reset_index() names these columns
# ['OSA', 'count'], so x="index"/y="OSA" below would need updating there.
# catplot creates its own figure; the original's plt.figure(figsize=(10,5))
# only produced an empty extra figure, so it is omitted here.
ax = sns.catplot(x="index", y="OSA", data=imb,
                 kind="bar", height=3, aspect=2)
plt.title('Distribution')
ax.set(ylabel='Count', xlabel='OSA or Non_OSA')
print(imb)
plt.show()
# (notebook output)
# index  OSA
# 0  1   50
# 1  0   50
# <Figure size 720x360 with 0 Axes>
#feature_extraction using tsfresh
# column_id groups the rows into the 256-sample windows; note that
# EfficientFCParameters was imported but is not passed here, so the full
# (slower) default feature set is computed.
extracted_features = extract_features(Data2, column_id='id')
print(extracted_features.shape)
extracted_features.head()
# (notebook output)
# Feature Extraction: 100%|██████████| 20/20 [01:04<00:00, 3.20s/it]
# (100, 4722)
# | ROC__variance_larger_than_standard_deviation | ROC__has_duplicate_max | ROC__has_duplicate_min | ROC__has_duplicate | ROC__sum_values | ROC__abs_energy | ROC__mean_abs_change | ROC__mean_change | ROC__mean_second_derivative_central | ROC__median | ... | Flow__permutation_entropy__dimension_5__tau_1 | Flow__permutation_entropy__dimension_6__tau_1 | Flow__permutation_entropy__dimension_7__tau_1 | Flow__query_similarity_count__query_None__threshold_0.0 | Flow__matrix_profile__feature_"min"__threshold_0.98 | Flow__matrix_profile__feature_"max"__threshold_0.98 | Flow__matrix_profile__feature_"mean"__threshold_0.98 | Flow__matrix_profile__feature_"median"__threshold_0.98 | Flow__matrix_profile__feature_"25"__threshold_0.98 | Flow__matrix_profile__feature_"75"__threshold_0.98 | |
# |---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
# | 1 | 0.0 | 0.0 | 0.0 | 1.0 | 2.9906 | 0.114112 | 0.017430 | -0.000045 | 0.000066 | 0.0120 | ... | 3.173318 | 3.794638 | 4.364281 | NaN | 29.120440 | 29.120440 | 29.120440 | 29.120440 | 29.120440 | 29.120440 |
# | 2 | 0.0 | 0.0 | 1.0 | 1.0 | -21.8848 | 2.126018 | 0.017303 | -0.000270 | -0.000027 | -0.0828 | ... | 3.212197 | 3.998353 | 4.619276 | NaN | 2.683419 | 26.832816 | 8.106598 | 3.805005 | 3.149366 | 4.503578 |
# | 3 | 0.0 | 0.0 | 0.0 | 1.0 | 27.5488 | 3.169981 | 0.017728 | 0.000377 | -0.000104 | 0.1074 | ... | 3.002860 | 3.506086 | 4.125488 | NaN | 29.120440 | 29.120440 | 29.120440 | 29.120440 | 29.120440 | 29.120440 |
# | 4 | 0.0 | 0.0 | 0.0 | 1.0 | 7.3046 | 0.671557 | 0.017569 | -0.000501 | -0.000087 | 0.0375 | ... | 3.038049 | 3.832666 | 4.547601 | NaN | 4.550740 | 28.565714 | 27.659488 | 28.565714 | 28.565714 | 28.565714 |
# | 5 | 0.0 | 0.0 | 0.0 | 1.0 | -14.0514 | 0.939084 | 0.017413 | 0.000282 | 0.000081 | -0.0540 | ... | 3.081210 | 3.710032 | 4.351462 | NaN | 29.120440 | 29.120440 | 29.120440 | 29.120440 | 29.120440 | 29.120440 |
# 5 rows × 4722 columns
# count missing values left by features that are undefined for some windows
extracted_features.isnull().sum().sum()
# (notebook output) 886
extracted_features = extracted_features.dropna(axis=1) #removing the attributes with null_values
# verify that no NaNs remain after dropping those columns
extracted_features.isnull().sum().sum()
# (notebook output) 0
#adding a dependant variable to the Extracted_features
# target's index (1..100) matches the tsfresh window ids, so this aligns by index
extracted_features[['Time', 'OSA']] = target
extracted_features.head()
# (notebook output)
# | ROC__variance_larger_than_standard_deviation | ROC__has_duplicate_max | ROC__has_duplicate_min | ROC__has_duplicate | ROC__sum_values | ROC__abs_energy | ROC__mean_abs_change | ROC__mean_change | ROC__mean_second_derivative_central | ROC__median | ... | Flow__fourier_entropy__bins_5 | Flow__fourier_entropy__bins_10 | Flow__fourier_entropy__bins_100 | Flow__permutation_entropy__dimension_3__tau_1 | Flow__permutation_entropy__dimension_4__tau_1 | Flow__permutation_entropy__dimension_5__tau_1 | Flow__permutation_entropy__dimension_6__tau_1 | Flow__permutation_entropy__dimension_7__tau_1 | Time | OSA | |
# |---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
# | 1 | 0.0 | 0.0 | 0.0 | 1.0 | 2.9906 | 0.114112 | 0.017430 | -0.000045 | 0.000066 | 0.0120 | ... | 0.045395 | 0.045395 | 0.090729 | 1.593746 | 2.414030 | 3.173318 | 3.794638 | 4.364281 | 21:17:25 | 1 |
# | 2 | 0.0 | 0.0 | 1.0 | 1.0 | -21.8848 | 2.126018 | 0.017303 | -0.000270 | -0.000027 | -0.0828 | ... | 0.045395 | 0.045395 | 0.181214 | 1.627601 | 2.511799 | 3.212197 | 3.998353 | 4.619276 | 21:17:54 | 1 |
# | 3 | 0.0 | 0.0 | 0.0 | 1.0 | 27.5488 | 3.169981 | 0.017728 | 0.000377 | -0.000104 | 0.1074 | ... | 0.045395 | 0.045395 | 0.181214 | 1.615743 | 2.392304 | 3.002860 | 3.506086 | 4.125488 | 21:18:40 | 1 |
# | 4 | 0.0 | 0.0 | 0.0 | 1.0 | 7.3046 | 0.671557 | 0.017569 | -0.000501 | -0.000087 | 0.0375 | ... | 0.045395 | 0.090729 | 0.215617 | 1.628181 | 2.440334 | 3.038049 | 3.832666 | 4.547601 | 21:19:14 | 1 |
# | 5 | 0.0 | 0.0 | 0.0 | 1.0 | -14.0514 | 0.939084 | 0.017413 | 0.000282 | 0.000081 | -0.0540 | ... | 0.045395 | 0.045395 | 0.181214 | 1.620885 | 2.443665 | 3.081210 | 3.710032 | 4.351462 | 21:19:41 | 1 |
# 5 rows × 4685 columns
# quick visual comparison of two chest-channel features over time
extracted_features.set_index('Time')[['Chest__skewness','Chest__standard_deviation']].plot(subplots=True)
# (notebook output)
# array([<AxesSubplot:xlabel='Time'>, <AxesSubplot:xlabel='Time'>],
#       dtype=object)
plt.figure(figsize=(7, 7))
# scatter of two chest features, coloured by class
# (the original or-ed the OSA==1 mask with itself — redundant, simplified here)
osa_mask = extracted_features.OSA == 1
plt.scatter(extracted_features['Chest__skewness'][osa_mask],
            extracted_features['Chest__standard_deviation'][osa_mask],
            marker='D',
            color='red',
            label='OSA')
non_osa_mask = extracted_features.OSA == 0
plt.scatter(extracted_features['Chest__skewness'][non_osa_mask],
            extracted_features['Chest__standard_deviation'][non_osa_mask],
            marker='o',
            color='blue',
            label='Non_OSA')
plt.xlabel('Chest__skewness')
plt.ylabel('Chest__standard_deviation')
plt.legend()
plt.show()
#Chest
# long-format table of the 15 Chest summary features (one row per window+feature)
Chest_New = extracted_features.melt(
    id_vars=['Time', 'OSA'],
    value_vars=['Chest__' + f for f in
                ['mean', 'sum_values', 'abs_energy', 'mean_second_derivative_central',
                 'median', 'standard_deviation', 'variation_coefficient', 'variance',
                 'skewness', 'kurtosis', 'root_mean_square', 'count_above_mean',
                 'count_below_mean', 'maximum', 'minimum']],
    value_name='Chest',
    var_name="TS_Feature")
#ROC
# long-format table of the 15 ROC summary features (one row per window+feature)
ROC_New = extracted_features.melt(
    id_vars=['Time', 'OSA'],
    value_vars=['ROC__' + f for f in
                ['mean', 'sum_values', 'abs_energy', 'mean_second_derivative_central',
                 'median', 'standard_deviation', 'variation_coefficient', 'variance',
                 'skewness', 'kurtosis', 'root_mean_square', 'count_above_mean',
                 'count_below_mean', 'maximum', 'minimum']],
    value_name='ROC',
    var_name="TS_Feature")
#LOC
# long-format table of the 15 LOC summary features (one row per window+feature)
LOC_New = extracted_features.melt(
    id_vars=['Time', 'OSA'],
    value_vars=['LOC__' + f for f in
                ['mean', 'sum_values', 'abs_energy', 'mean_second_derivative_central',
                 'median', 'standard_deviation', 'variation_coefficient', 'variance',
                 'skewness', 'kurtosis', 'root_mean_square', 'count_above_mean',
                 'count_below_mean', 'maximum', 'minimum']],
    value_name='LOC',
    var_name="TS_Feature")
#ECG
# long-format table of the 15 ECG summary features (one row per window+feature)
ECG_New = extracted_features.melt(
    id_vars=['Time', 'OSA'],
    value_vars=['ECG__' + f for f in
                ['mean', 'sum_values', 'abs_energy', 'mean_second_derivative_central',
                 'median', 'standard_deviation', 'variation_coefficient', 'variance',
                 'skewness', 'kurtosis', 'root_mean_square', 'count_above_mean',
                 'count_below_mean', 'maximum', 'minimum']],
    value_name='ECG',
    var_name="TS_Feature")
#Flow
# long-format table of the 15 Flow summary features (one row per window+feature)
Flow_New = extracted_features.melt(
    id_vars=['Time', 'OSA'],
    value_vars=['Flow__' + f for f in
                ['mean', 'sum_values', 'abs_energy', 'mean_second_derivative_central',
                 'median', 'standard_deviation', 'variation_coefficient', 'variance',
                 'skewness', 'kurtosis', 'root_mean_square', 'count_above_mean',
                 'count_below_mean', 'maximum', 'minimum']],
    value_name='Flow',
    var_name="TS_Feature")
#ABD
# long-format table of the 15 ABD summary features (one row per window+feature)
ABD_New = extracted_features.melt(
    id_vars=['Time', 'OSA'],
    value_vars=['ABD__' + f for f in
                ['mean', 'sum_values', 'abs_energy', 'mean_second_derivative_central',
                 'median', 'standard_deviation', 'variation_coefficient', 'variance',
                 'skewness', 'kurtosis', 'root_mean_square', 'count_above_mean',
                 'count_below_mean', 'maximum', 'minimum']],
    value_name='ABD',
    var_name="TS_Feature")
# join the six long-format channel tables column-wise on their shared index
All_Channels = Chest_New
for channel_series in (LOC_New['LOC'], ROC_New['ROC'], ECG_New['ECG'],
                       Flow_New['Flow'], ABD_New['ABD']):
    All_Channels = All_Channels.merge(channel_series, how='inner',
                                      left_index=True, right_index=True)
print(All_Channels.shape)
All_Channels.head()
# (notebook output)
# (1500, 9)
# | Time | OSA | TS_Feature | Chest | LOC | ROC | ECG | Flow | ABD | |
# |---|---|---|---|---|---|---|---|---|---|
# | 0 | 21:17:25 | 1 | Chest__mean | -0.056870 | 0.076241 | 0.011682 | 0.001161 | -0.029242 | -0.016866 |
# | 1 | 21:17:54 | 1 | Chest__mean | -0.007290 | -0.024610 | -0.085487 | -0.078925 | -0.125155 | 0.122939 |
# | 2 | 21:18:40 | 1 | Chest__mean | 0.009320 | -0.020929 | 0.107612 | 0.066500 | -0.099652 | 0.229364 |
# | 3 | 21:19:14 | 1 | Chest__mean | 0.071706 | -0.056943 | 0.028534 | -0.006053 | 0.069187 | 0.249368 |
# | 4 | 21:19:41 | 1 | Chest__mean | 0.295169 | 0.039795 | -0.054888 | -0.013921 | 0.077239 | 0.685883 |
# mean-per-window plots: 50 pre-OSA segments (red) vs 50 regular segments (cyan),
# one identically-laid-out figure per channel (deduplicates six copy-pasted blocks;
# the figures are produced in the same order as before: ABD, Flow, Chest, ECG, ROC, LOC)
for channel, ylabel in [('ABD', 'ABD Mean'), ('Flow', 'Flow Mean'),
                        ('Chest', 'Chest Mean'), ('ECG', 'ECG Mean'),
                        ('ROC', 'ROC Mean'), ('LOC', 'LOC Mean')]:
    fig, ax = plt.subplots(figsize=(20, 5))
    # rows 0..50 are pre-OSA (row 50 overlaps so the two traces join visually)
    plt.plot(All_Channels[channel].iloc[0:51], color='red', lw=3)
    plt.plot(All_Channels[channel].iloc[50:100], color='cyan', lw=2)  # regular breathing
    plt.xlabel('Pre OSA/ Regular breathing Segments', fontsize=20)
    plt.ylabel(ylabel, fontsize=18)
    plt.xticks(rotation=90, fontsize=16)
    plt.yticks(fontsize=16)
    x = np.arange(0, 100, 2)
    plt.margins(x=0)
    plt.xticks(x)
    ax.set_facecolor('darkslategrey')
    ax.xaxis.labelpad = 20
    plt.grid(True, lw=0.3)
    plt.show()
# model inputs: the six per-window channel statistics; label: the OSA flag
feature_cols = ['Chest', 'LOC', 'ROC', 'ECG', 'Flow', 'ABD']
X = All_Channels[feature_cols]   #independent variables
y = All_Channels['OSA']          #dependent variable
X.head()
# (notebook output)
# | Chest | LOC | ROC | ECG | Flow | ABD | |
# |---|---|---|---|---|---|---|
# | 0 | -0.056870 | 0.076241 | 0.011682 | 0.001161 | -0.029242 | -0.016866 |
# | 1 | -0.007290 | -0.024610 | -0.085487 | -0.078925 | -0.125155 | 0.122939 |
# | 2 | 0.009320 | -0.020929 | 0.107612 | 0.066500 | -0.099652 | 0.229364 |
# | 3 | 0.071706 | -0.056943 | 0.028534 | -0.006053 | 0.069187 | 0.249368 |
# | 4 | 0.295169 | 0.039795 | -0.054888 | -0.013921 | 0.077239 | 0.685883 |
# preview the first labels (the first rows come from the pre-OSA frame)
y.head()
# (notebook output) 0 1  1 1  2 1  3 1  4 1  Name: OSA, dtype: int64
# 1500 rows = 100 windows x 15 melted features per channel
print(X.shape)
print(y.shape)
# (notebook output) (1500, 6) (1500,)
#correlation matrix with heatmap
Var_Corr = X.corr()
sns.set(font_scale=1.5)
plt.figure(figsize=(8, 6))
sns.heatmap(Var_Corr, xticklabels=Var_Corr.columns, yticklabels=Var_Corr.columns,
            annot=True, linewidths=2)
plt.show()
#checking the correlation between variables with a pair plot
# pairplot builds its own figure, so the original's plt.figure(figsize=(8,8))
# only produced the empty extra figure seen in the notebook output; omitted here
sns.set(font_scale=1.5)
sns.pairplot(X)
plt.show()
# (notebook output) <Figure size 576x576 with 0 Axes>
#data_spliting
# 70/30 shuffled train-test split with a fixed seed for reproducibility.
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)
print("X train : ",X_train.shape,"\nX test : ",X_test.shape,"\nY train : ",y_train.shape,"\nY test : ",y_test.shape)
X train : (1050, 6) X test : (450, 6) Y train : (1050,) Y test : (450,)
# Normalizer rescales each SAMPLE (row) to unit norm; it is stateless, so
# fit() learns nothing from the data — this is per-row scaling, not
# per-feature standardization (unlike StandardScaler).
Normalize_scaler = Normalizer() #normalizer scaler
Norm_X= Normalize_scaler.fit_transform(X)  # normalized full feature matrix
Norm_X_train= Normalize_scaler.fit_transform(X_train)
Norm_X_test= Normalize_scaler.transform(X_test)
Normalize_scaler
Normalizer()
# SVM with a linear kernel on the raw (unnormalized) train/test split.
clf_1 = svm.SVC(kernel = 'linear') #linear
model_1 = clf_1.fit(X_train, y_train) #linear (fit returns the estimator itself)
prediction_1 = model_1.predict(X_test) #linear
print('Accuracy of SVM without normalized data #linear : %.3f' % accuracy_score(y_test, prediction_1))
print('Precision : %.3f' % precision_score(y_test, prediction_1))
print('Recall : %.3f' % recall_score(y_test, prediction_1))
print('F1 Score : %.3f' % f1_score(y_test, prediction_1))
Accuracy of SVM without normalized data #linear : 0.529 Precision : 0.563 Recall : 0.181 F1 Score : 0.274
# SVM with an RBF (non-linear) kernel on the raw train/test split.
clf_1a = SVC(kernel = 'rbf', random_state = 0) #non_linear
model_1a = clf_1a.fit(X_train, y_train) #non_linear
prediction_1a = model_1a.predict(X_test) #non_linear
print('Accuracy of SVM without normalized data #non-linear : %.3f' % accuracy_score(y_test, prediction_1a))
print('Precision : %.3f' % precision_score(y_test, prediction_1a))
print('Recall : %.3f' % recall_score(y_test, prediction_1a))
print('F1 Score : %.3f' % f1_score(y_test, prediction_1a))
Accuracy of SVM without normalized data #non-linear : 0.529 Precision : 0.547 Recall : 0.235 F1 Score : 0.329
# Linear SVM trained on the row-normalized split.
clf_2 = svm.SVC(kernel = 'linear') #linear
model_2 = clf_2.fit(Norm_X_train, y_train) #linear
prediction_2 = model_2.predict(Norm_X_test) #linear
# NOTE(review): cross_validate is run on the RAW X/y (it clones clf_2, so the
# fitted model above is untouched) — the CV accuracy printed below is NOT for
# normalized data. Confirm whether Norm_X was intended here.
cv_svm = cross_validate(clf_2, X, y, cv=10, scoring = "accuracy") #cross_validaiton
print('Accuracy of SVM with normalized data #linear : %.3f' % accuracy_score(y_test, prediction_2))
print('Accuracy of SVM #linear #Cross_validation : %.3f' % np.mean(cv_svm["test_score"]))
print('Precision : %.3f' % precision_score(y_test, prediction_2))
print('Recall : %.3f' % recall_score(y_test, prediction_2))
print('F1 Score : %.3f' % f1_score(y_test, prediction_2))
Accuracy of SVM with normalized data #linear : 0.547 Accuracy of SVM #linear #Cross_validation : 0.518 Precision : 0.532 Recall : 0.643 F1 Score : 0.582
# RBF SVM on the row-normalized split, plus 10-fold and 5-fold CV.
clf_2a = SVC(kernel = 'rbf', random_state = 0) #non_linear
model_2a = clf_2a.fit(Norm_X_train, y_train) #non_linear
prediction_2a = model_2a.predict(Norm_X_test) #non_linear
# NOTE(review): both CV runs use the RAW X/y, not the normalized data —
# confirm whether Norm_X was intended.
cv_svm2 = cross_validate(clf_2a, X, y, cv=10, scoring = "accuracy") #cross_validaiton
cv_svm2a = cross_validate(clf_2a, X, y, cv=5, scoring = "accuracy") #cross_validaiton
print('Accuracy of SVM with normalized data #non-linear : %.3f' % accuracy_score(y_test, prediction_2a))
print('Accuracy of SVM #non-linear #Cross_validation : %.3f' % np.mean(cv_svm2["test_score"]))
print('Accuracy of SVM #non-linear #Cross_validation #k=5 : %.3f' % np.mean(cv_svm2a["test_score"]))
print('Precision : %.3f' % precision_score(y_test, prediction_2a))
print('Recall : %.3f' % recall_score(y_test, prediction_2a))
print('F1 Score : %.3f' % f1_score(y_test, prediction_2a))
Accuracy of SVM with normalized data #non-linear : 0.593 Accuracy of SVM #non-linear #Cross_validation : 0.513 Accuracy of SVM #non-linear #Cross_validation #k=5 : 0.492 Precision : 0.590 Recall : 0.566 F1 Score : 0.577
# Random forest on the raw split (default hyperparameters; no random_state,
# so results vary slightly between runs).
clf_3= RandomForestClassifier()
model_3 = clf_3.fit(X_train, y_train)
prediction_3 = model_3.predict(X_test)
cv_rf1 = cross_validate(clf_3, X, y, cv=10, scoring = "accuracy") #cross_validaiton
print('Accuracy of Random Forest without normalized data : %.3f' % accuracy_score(y_test, prediction_3))
print('Accuracy of Random Forest #Cross_validation : %.3f' % np.mean(cv_rf1["test_score"]))
print('Precision : %.3f' % precision_score(y_test, prediction_3))
print('Recall : %.3f' % recall_score(y_test, prediction_3))
print('F1 Score : %.3f' % f1_score(y_test, prediction_3))
Accuracy of Random Forest without normalized data : 0.673 Accuracy of Random Forest #Cross_validation : 0.623 Precision : 0.667 Recall : 0.670 F1 Score : 0.668
# Random forest on the row-normalized split; clf_4's feature importances are
# reused at the bottom of the file.
clf_4= RandomForestClassifier()
model_4 = clf_4.fit(Norm_X_train, y_train)
prediction_4 = model_4.predict(Norm_X_test)
# NOTE(review): CV again runs on the RAW X/y, not Norm_X — confirm intent.
cv_rf2 = cross_validate(clf_4, X, y, cv=10, scoring = "accuracy") #cross_validaiton
print('Accuracy of Random Forest with normalized data : %.3f' % accuracy_score(y_test, prediction_4))
print('Accuracy of Random Forest #Cross_validation : %.3f' % np.mean(cv_rf2["test_score"]))
print('Precision : %.3f' % precision_score(y_test, prediction_4))
print('Recall : %.3f' % recall_score(y_test, prediction_4))
print('F1 Score : %.3f' % f1_score(y_test, prediction_4))
Accuracy of Random Forest with normalized data : 0.627 Accuracy of Random Forest #Cross_validation : 0.631 Precision : 0.621 Recall : 0.615 F1 Score : 0.618
# Sweep KNN over k = 1..Ks-1 on the raw split and plot the test-set error rate.
Ks = 40
error_rate = []
# Will take some time
for k in range(1, Ks):  # use Ks instead of the previously hard-coded 40
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    pred_k = knn.predict(X_test)
    error_rate.append(np.mean(pred_k != y_test))
plt.figure(figsize=(10,6))
plt.plot(range(1, Ks), error_rate, color='blue', linestyle='dashed', marker='o', markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
# BUG FIX: error_rate[i] corresponds to k = i + 1 (the range starts at 1), so
# the best k is index + 1 — the original printed the raw index (13 instead of
# 14, which is the k actually used by clf_5/clf_6 below).
print("Minimum error:-", min(error_rate), "at K =", error_rate.index(min(error_rate)) + 1)
Minimum error:- 0.37555555555555553 at K = 13
# Same sweep as above, but tracking test-set accuracy instead of error rate.
acc = []
# Will take some time
from sklearn import metrics
for k in range(1, 40):
    neigh = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    yhat = neigh.predict(X_test)
    acc.append(metrics.accuracy_score(y_test, yhat))
plt.figure(figsize=(10,6))
plt.plot(range(1, 40), acc, color='blue', linestyle='dashed', marker='o', markerfacecolor='red', markersize=10)
plt.title('accuracy vs. K Value')
plt.xlabel('K')
plt.ylabel('Accuracy')
# BUG FIX: acc[i] corresponds to k = i + 1, so report index + 1 as the best k
# (the original printed 13; the true best k is 14, matching clf_5/clf_6 below).
print("Maximum accuracy:-", max(acc), "at K =", acc.index(max(acc)) + 1)
Maximum accuracy:- 0.6244444444444445 at K = 13
# Final KNN at k = 14 on the raw split (best k from the sweep above, once the
# index-to-k offset is accounted for).
clf_5 = KNeighborsClassifier(n_neighbors = 14)
model_5 = clf_5.fit(X_train, y_train)
prediction_5 = model_5.predict(X_test)
print('Accuracy of KNN without normalized data : %.3f' % accuracy_score(y_test, prediction_5))
print('Precision : %.3f' % precision_score(y_test, prediction_5))
print('Recall : %.3f' % recall_score(y_test, prediction_5))
print('F1 Score : %.3f' % f1_score(y_test, prediction_5))
Accuracy of KNN without normalized data : 0.624 Precision : 0.630 Recall : 0.570 F1 Score : 0.599
# KNN at k = 14 on the row-normalized split, plus 10-fold CV.
clf_6 = KNeighborsClassifier(n_neighbors = 14)
model_6 = clf_6.fit(Norm_X_train, y_train)
prediction_6 = model_6.predict(Norm_X_test)
# NOTE(review): CV uses the RAW X/y, not Norm_X — confirm intent.
cv_kNN = cross_validate(clf_6, X, y, cv=10, scoring = "accuracy") #cross_validaiton
print('Accuracy of KNN with normalized data : %.3f' % accuracy_score(y_test, prediction_6))
print('Accuracy of KNN #Cross_validation : %.3f' % np.mean(cv_kNN["test_score"]))
print('Precision : %.3f' % precision_score(y_test, prediction_6))
print('Recall : %.3f' % recall_score(y_test, prediction_6))
print('F1 Score : %.3f' % f1_score(y_test, prediction_6))
Accuracy of KNN with normalized data : 0.600 Accuracy of KNN #Cross_validation : 0.554 Precision : 0.608 Recall : 0.520 F1 Score : 0.561
print("Accuracy of SVM without normalized data #linear : ",accuracy_score(y_test,prediction_1),
"\nAccuracy of SVM with normalized data #linear : ",accuracy_score(y_test,prediction_2),
"\nAccuracy of SVM without normalized data #non_linear : ",accuracy_score(y_test,prediction_1a),
"\nAccuracy of SVM with normalized data #non_linear : ",accuracy_score(y_test,prediction_2a),
"\nAccuracy of SVM #linear #Cross_validation : ",np.mean(cv_svm["test_score"]),
"\nAccuracy of SVM #non_linear #Cross_validation : ",np.mean(cv_svm2["test_score"]),
"\nAccuracy of Random Forest without normalized data : ",accuracy_score(y_test,prediction_3),
"\nAccuracy of Random Forest with normalized data : ",accuracy_score(y_test,prediction_4),
"\nAccuracy of Random Forest #Cross_validation : ",np.mean(cv_rf1["test_score"]),
"\nAccuracy of KNN without normalized data : ",accuracy_score(y_test,prediction_5),
"\nAccuracy of KNN with normalized data : ",accuracy_score(y_test,prediction_6),
"\nAccuracy of KNN #Cross_validation : ",np.mean(cv_kNN["test_score"]),
)
Accuracy of SVM without normalized data #linear : 0.5288888888888889 Accuracy of SVM with normalized data #linear : 0.5466666666666666 Accuracy of SVM without normalized data #non_linear : 0.5288888888888889 Accuracy of SVM with normalized data #non_linear : 0.5933333333333334 Accuracy of SVM #linear #Cross_validation : 0.518 Accuracy of SVM #non_linear #Cross_validation : 0.5133333333333334 Accuracy of Random Forest without normalized data : 0.6733333333333333 Accuracy of Random Forest with normalized data : 0.6266666666666667 Accuracy of Random Forest #Cross_validation : 0.6233333333333333 Accuracy of KNN without normalized data : 0.6244444444444445 Accuracy of KNN with normalized data : 0.6 Accuracy of KNN #Cross_validation : 0.554
# Grouped plotly bar chart comparing precision/recall/F1/accuracy of the four
# normalized-data models. NOTE: plotly.graph_objects is imported as `py` at
# the top of the file (an unconventional alias — it is not plotly.express).
x = ['SVM (Linear)', 'SVM (Non_linear)', 'Random Forest', 'K-NN']
fig = py.Figure(data=[py.Bar(name = 'Precision', x = x, y = [precision_score(y_test, prediction_2), precision_score(y_test, prediction_2a), precision_score(y_test, prediction_4), precision_score(y_test, prediction_6)]),
py.Bar(name = 'Recall', x = x, y = [recall_score(y_test, prediction_2), recall_score(y_test, prediction_2a), recall_score(y_test, prediction_4), recall_score(y_test, prediction_6)]),
py.Bar(name = 'F1_Score', x = x, y = [f1_score(y_test, prediction_2), f1_score(y_test, prediction_2a), f1_score(y_test, prediction_4), f1_score(y_test, prediction_6)]),
py.Bar(name = 'Accuracy_Score', x = x, y = [accuracy_score(y_test, prediction_2), accuracy_score(y_test, prediction_2a), accuracy_score(y_test, prediction_4), accuracy_score(y_test, prediction_6)])])
fig.update_layout(font_size=14, xaxis_title="", yaxis_title="")
fig.update_xaxes(dict(tickfont = dict(size=15, color='black')), tickprefix="<b>")
fig.show()
print("SVM with normalized data #linear :\n",classification_report(y_test,prediction_2),
"\n\nSVM with normalized data #non_linear :\n",classification_report(y_test,prediction_2a),
"\n\nRandom Forest with normalized data :\n",classification_report(y_test,prediction_4),
"\n\nKNN with normalized data :\n",classification_report(y_test,prediction_6),
)
SVM with normalized data #linear :
precision recall f1-score support
0 0.57 0.45 0.50 229
1 0.53 0.64 0.58 221
accuracy 0.55 450
macro avg 0.55 0.55 0.54 450
weighted avg 0.55 0.55 0.54 450
SVM with normalized data #non_linear :
precision recall f1-score support
0 0.60 0.62 0.61 229
1 0.59 0.57 0.58 221
accuracy 0.59 450
macro avg 0.59 0.59 0.59 450
weighted avg 0.59 0.59 0.59 450
Random Forest with normalized data :
precision recall f1-score support
0 0.63 0.64 0.63 229
1 0.62 0.62 0.62 221
accuracy 0.63 450
macro avg 0.63 0.63 0.63 450
weighted avg 0.63 0.63 0.63 450
KNN with normalized data :
precision recall f1-score support
0 0.59 0.68 0.63 229
1 0.61 0.52 0.56 221
accuracy 0.60 450
macro avg 0.60 0.60 0.60 450
weighted avg 0.60 0.60 0.60 450
print("SVM with normalized data #linear :\n",confusion_matrix(y_test,prediction_2),
"\n\nSVM with normalized data #non_linear :\n",confusion_matrix(y_test,prediction_2a),
"\n\nRandom Forest with normalized data :\n",confusion_matrix(y_test,prediction_4),
"\n\nKNN with normalized data :\n",confusion_matrix(y_test,prediction_6),
)
SVM with normalized data #linear : [[104 125] [ 79 142]] SVM with normalized data #non_linear : [[142 87] [ 96 125]] Random Forest with normalized data : [[146 83] [ 85 136]] KNN with normalized data : [[155 74] [106 115]]
# Feature importances from the normalized-data random forest (clf_4).
# clf_4 was fit on a numpy array, so importances follow the column order of X;
# this Features list mirrors that order.
Features=['Chest', 'LOC', 'ROC', 'ECG', 'Flow', 'ABD']
feature_imp=pd.Series(clf_4.feature_importances_, index=Features).sort_values(ascending=False)
Feature_Importance=feature_imp
bestFeatures=feature_imp.nlargest(10)  # only 6 features exist, so this keeps all of them
Feature_Importance
ABD 0.185911 Flow 0.168248 Chest 0.167796 ROC 0.163097 ECG 0.159476 LOC 0.155472 dtype: float64
# Plotly-express bar chart of feature importances, colored by importance score.
fig = px.bar(x=bestFeatures.index, y=bestFeatures, color=bestFeatures, color_continuous_scale='portland')
fig.update_layout(font_size=14, xaxis_title="", yaxis_title="Feature Importance Score")
fig.update_xaxes(tickfont = dict(size=16, color='black'), tickprefix="<b>")
fig.show()